{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "from py2neo import Graph\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "graph = Graph(\"bolt://localhost:7687\", auth=(\"neo4j\", \"neo\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Discrete Features\n",
    "\n",
    "Let's start by predicting reviews based only on discrete features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>aveMyStars</th>\n",
       "      <th>aveOtherStars</th>\n",
       "      <th>stars</th>\n",
       "      <th>userFriends</th>\n",
       "      <th>userId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>true</td>\n",
       "      <td>0</td>\n",
       "      <td>vefBMC37_FtVXA6gdGcRKg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.600000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>true</td>\n",
       "      <td>44</td>\n",
       "      <td>5gKZq8-yNns_z5UBlRQggQ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.821918</td>\n",
       "      <td>4.0</td>\n",
       "      <td>true</td>\n",
       "      <td>149</td>\n",
       "      <td>v3YOx9T4jRSw8XP9Rd9H4g</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.304348</td>\n",
       "      <td>NaN</td>\n",
       "      <td>true</td>\n",
       "      <td>17</td>\n",
       "      <td>8bRfSEzLoDoa8cjHOWOljw</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.250000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>false</td>\n",
       "      <td>1</td>\n",
       "      <td>jbn7XQV7CngRu0sA6cZN1Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   aveMyStars  aveOtherStars  stars  userFriends                  userId\n",
       "0    5.000000            NaN   true            0  vefBMC37_FtVXA6gdGcRKg\n",
       "1    3.600000            NaN   true           44  5gKZq8-yNns_z5UBlRQggQ\n",
       "2    3.821918            4.0   true          149  v3YOx9T4jRSw8XP9Rd9H4g\n",
       "3    3.304348            NaN   true           17  8bRfSEzLoDoa8cjHOWOljw\n",
       "4    3.250000            NaN  false            1  jbn7XQV7CngRu0sA6cZN1Q"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "query = \"\"\"\n",
    "MATCH (:Category {name: $category})<-[:IN_CATEGORY]-(business:Business)-[:IN_CITY]->(:City {name: $city})\n",
    "MATCH (business:Business)<-[:REVIEWS]-(review:Review)<-[:WROTE]-(user:User)\n",
    "WITH business, user,\n",
    "     size((user)-[:FRIENDS]->()) AS userFriends,\n",
    "     review.stars AS stars,\n",
    "     review\n",
    "LIMIT 1000\n",
    "OPTIONAL MATCH (business)<-[:REVIEWS]-(otherRev)<-[:WROTE]-(other:User)-[:FRIENDS]->(user) WHERE otherRev.date < review.date\n",
    "WITH  user, business, userFriends, avg(otherRev.stars) AS aveOtherStars, stars\n",
    "OPTIONAL MATCH (user)-[:WROTE]->(review) WHERE not((review)-[:REVIEWS]->(business))\n",
    "RETURN user.id AS userId,\n",
    "       userFriends,\n",
    "       aveOtherStars,\n",
    "       avg(review.stars) AS aveMyStars,\n",
    "       CASE WHEN stars > 3 THEN \"true\" ELSE \"false\" END as stars\n",
    "\"\"\"\n",
    "\n",
    "df = graph.run(query, {\"city\": \"Las Vegas\", \"category\": \"Restaurants\"}).to_data_frame()\n",
    "display(df.head())\n",
    "\n",
    "# How many tips has the user given?\n",
    "# How many tips does the business have? \n",
    "# Features to add\n",
    "# Is the business in the same cluster as the user?\n",
    "# Influential friend ranked it highly? (PR bigger than $score)\n",
    "# Influential friends who ranked it lowly?\n",
    "# Photos and tips?\n",
    "# Triadic balance / Triangle count? \n",
    "# Cluster based on people reviewing the same places"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = ['userFriends', \"aveOtherStars\", \"aveMyStars\"]\n",
    "X = df[columns]\n",
    "y = df[['stars']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "scaler = StandardScaler()\n",
    "X = scaler.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Input contains NaN, infinity or a value too large for dtype('float32').",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-109-874d6fa104bc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0mrandom_forest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mRandomForestClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_estimators\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_depth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mrandom_forest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m \u001b[0my_predict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrandom_forest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/projects/oreilly-graph-algorithms-book/a/lib/python3.6/site-packages/sklearn/ensemble/forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m    245\u001b[0m         \"\"\"\n\u001b[1;32m    246\u001b[0m         \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 247\u001b[0;31m         \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDTYPE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    248\u001b[0m         \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    249\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/projects/oreilly-graph-algorithms-book/a/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m    451\u001b[0m                              % (array.ndim, estimator_name))\n\u001b[1;32m    452\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 453\u001b[0;31m             \u001b[0m_assert_all_finite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    454\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    455\u001b[0m     \u001b[0mshape_repr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_shape_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/projects/oreilly-graph-algorithms-book/a/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X)\u001b[0m\n\u001b[1;32m     42\u001b[0m             and not np.isfinite(X).all()):\n\u001b[1;32m     43\u001b[0m         raise ValueError(\"Input contains NaN, infinity\"\n\u001b[0;32m---> 44\u001b[0;31m                          \" or a value too large for %r.\" % X.dtype)\n\u001b[0m\u001b[1;32m     45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     46\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float32')."
     ]
    }
   ],
   "source": [
    "import time\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.metrics import recall_score\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "t0 = time.clock()\n",
    "\n",
    "random_forest = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=1)\n",
    "random_forest.fit(X_train, y_train.values.ravel())\n",
    "\n",
    "y_predict = random_forest.predict(X_test)\n",
    "\n",
    "display(accuracy_score(y_test, y_predict))\n",
    "display(precision_score(y_test, y_predict, average=\"binary\"))\n",
    "display(recall_score(y_test, y_predict, average=\"binary\"))\n",
    "\n",
    "run_time = time.clock() - t0\n",
    "print('RFC max-depth=10 and n-estimator=30 run in %.3f s' % run_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predict</th>\n",
       "      <th>actual</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>172</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>163</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>136</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>175</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     predict  actual\n",
       "44      True    True\n",
       "172     True    True\n",
       "163    False    True\n",
       "35     False    True\n",
       "136     True    True\n",
       "11      True    True\n",
       "123     True    True\n",
       "82      True    True\n",
       "175     True   False\n",
       "102     True    True"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "predictions_df = pd.DataFrame({\"predict\": y_predict, \"actual\": y_test[\"stars\"]})\n",
    "display(predictions_df.head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "userFriends 0.3543850096691566\n",
      "aveOtherStars 0.1958516374111908\n",
      "aveMyStars 0.44976335291965264\n"
     ]
    }
   ],
   "source": [
    "for score, feature in zip(random_forest.feature_importances_, columns):\n",
    "    print(feature, score)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
